Preprocessing Dataset

Libraries

Library for read dataset.

library(readr)

Library for data frames processing.

library(dplyr)
library(tidyr)

Library for R Markdown.

library(rmarkdown)
library(knitr)

Library for data presentation.

library(scales)

Library for manage strings.

library(stringr)

Load dataset

Load dataset.

df <- read_csv("../virusTotal/data/virusTotal.csv")

Statistics

Dimensions.

dim(df)
## [1] 183 447

Types

View witch types are in the dataset.

col_types_all <- 
  df %>% 
  sapply(typeof) %>% 
  unlist()

col_types_table <- 
  col_types_all %>% 
  table()

col_types <- 
  col_types_table %>% 
  as.vector()

names(col_types) <- names(col_types_table)
## character    double   logical 
##       158       204        85

As can be seen there are the three expected types: character, double and logical.

NA

Percentaje of NA values

Define function to see the amount of NA values in the dataframe.

percent_of_NA <- 
  function(df){
    num_of_NA <- 
      df %>% is.na() %>% sum()
    num_of_values <- 
      df %>% dim() %>% prod()
    percent_of_NA <- 
      (num_of_NA / num_of_values) %>% 
      percent()
    return(percent_of_NA)
  }
percent_of_NA(df)
## [1] "37%"

Columns with NA

Define functions to see the NA in columns.

num_of_NA_by_column <- 
  function(df){
    df %>% is.na() %>% colSums()
  }
remove_0 <- 
  function(x) x[x!=0]
names_of_colums_with_NA <- 
  function(df)
    df %>% 
      num_of_NA_by_column() %>% 
      remove_0 %>% 
      names()
percentaje_of_cols_with_NA <-
  function(df)
    (length(names_of_colums_with_NA(df)) / ncol(df)) %>% 
    percent()

Compute the percentaje of cols with NA.

percentaje_of_cols_with_NA(df)
## [1] "74%"

Inspect if there are columns full of NA.

is_full_of_NA <- function(col){
  num_of_NA <- 
    col %>% 
    is.na() %>% 
    sum()
  return(num_of_NA == length(col))
}
cols_full_of_NA <- 
  df %>% 
  select_if(is_full_of_NA) %>% 
  names()
##  [1] "authentihash"                  "scans.Bkav.result"            
##  [3] "scans.CMC.result"              "scans.ALYac.result"           
##  [5] "scans.Malwarebytes.result"     "scans.K7AntiVirus.result"     
##  [7] "scans.Baidu.result"            "scans.SUPERAntiSpyware.result"
##  [9] "scans.Gridinsoft.result"       "scans.ViRobot.result"         
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"         
## [13] "scans.VBA32.result"            "scans.Zoner.result"           
## [15] "scans.Panda.result"            "scans.Elastic.result"         
## [17] "scans.Cylance.result"          "scans.SentinelOne.result"

As can be seen there are many columns that are full of NA, so can be deleted.

df <- 
  df %>% 
  select(-all_of(cols_full_of_NA))

Colums with the same value

Maybe there are columns that has the same value along all the vector, so are useless.

Define function to remove these columns.

remove_columns_with_the_same_value <- 
  function(df)
      select_if(df, function(col) length(unique(col)) > 1)

Apply function.

num_of_cols_after_remove <- 
  df %>% 
  remove_columns_with_the_same_value() %>% 
  ncol()

Calculate the number of columns with same value.

ncol(df) - num_of_cols_after_remove
## [1] 39

Awesom! Many colums found. Let’s remove them.

df <- 
  remove_columns_with_the_same_value(df)

Inspecting dataframe

Now let’s deeply inspect into the dataframe.

Replace NA for 0 into permissions colums

The permissions columns (PERM) seems that there are NAs where there should be 0s. So it would be better to replace them.

replace_na_which_colname_match <- 
  function(df, pattern, replacement){
    cols_to_replace <- 
      df %>% 
      colnames() %>% 
      str_which(pattern)
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace_na(col, replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }
replace_na_when <- 
  function(df, fun, replacement){
    cols_to_replace <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace_na(col, replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }
pattern <- "additional_info.androguard.RiskIndicator.PERM"

df <- 
  df %>% 
  replace_na_which_colname_match(pattern, 0)

df %>% 
  select(., str_which(colnames(.), pattern)) %>% 
  paged_table()
df_permissions <- 
  df %>% 
  select(., str_which(colnames(.), "additional_info.androguard.RiskIndicator.PERM")) %>% 
  sapply(function(col) replace_na(col, 0))

df_without_permissions <- 
  df %>% 
  select(., -c(str_which(colnames(.), "additional_info.androguard.RiskIndicator.PERM")))

df <- cbind(df_permissions, df_without_permissions)

As factor

labels <- 
  function(n){
    if(n == 5){
      return(c("very low", "low", "medium", "high", "very high"))
    }else if(n == 4){
      return(c("very low", "low", "high", "very high"))
    }else if(n == 3){
      return(c("low", "medium", "high"))
    }else if(n == 2){
      return(c("low", "high"))
    }else{
      stop("Not avalible")
    }
  }

cut_by_quantiles <- 
  function(col){
    quantiles <- 
      col %>% 
      quantile(na.rm = TRUE) %>% 
      unique()
    if(length(quantiles) > 2){
      col <- 
        col %>% 
        cut(breaks = quantiles, 
            labels = labels(length(quantiles)-1),
            include.lowest = TRUE)
    }
    return(col)
  }

df_cut_by_quantiles <- 
  function(df){
    df_without_numeric <- 
      df[sapply(df, function(col) !is.numeric(col))]
    df_numeric <- 
      df %>% 
      select_if(is.numeric)
    df_numeric <- 
      df_numeric %>% 
      lapply(cut_by_quantiles)
    return(cbind(df_without_numeric, df_numeric))
  }

Col total permissions

pattern <- "additional_info.androguard.RiskIndicator.PERM"

df_without_permissions <- 
  df %>% 
  select(., -(str_which(colnames(.), pattern)))

df_permissions <- 
  df %>% 
  select(., str_which(colnames(.), pattern)) %>% 
  mutate(., total_PERMs = rowSums(.))

df <- cbind(df_without_permissions, df_permissions)